from anndata import AnnData
import scrublet as scr
import pandas as pd
import scanpy as sc
import numpy as np
import rbcde
import scipy
import os
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.figdir = './figures-N1-endometrium-integration/'
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80) # low dpi (dots per inch) yields small inline figures
Import existing objects and rebuild raw form. There are some holes in their metadata, fill them up. Also, fix up some stuff that was flawed in the original metadata CSVs.
bdata1 = sc.read('../N1-cbtm/endometrium-N2-10x.h5ad')
bdata1 = AnnData(bdata1.raw.X, obs=bdata1.obs, var=bdata1.raw.var)
bdata2 = sc.read('../N2-biopsies/endometrium-N2-10x.h5ad')
bdata2 = AnnData(bdata2.raw.X, obs=bdata2.obs, var=bdata2.raw.var)
bdata1.obs['clinical'] = 'U'
bdata1.obs['day'] = 'U'
bdata1.obs['type'] = 'CBTM'
#this one sample has a T, the rest are C
bdata1.obs['treatment'] = ['T' if '4861STDY7771123' in i else 'C' for i in bdata1.obs_names]
#redo all EN to ENMY for bdata1
bdata1.obs['location'] = [str(i) for i in bdata1.obs['location']]
bdata1.obs.loc[bdata1.obs['location']=='EN', 'location'] = 'ENMY'
#biopsies use a more collapsed form of this
bdata1.obs['phase'] = ['P' if i=='proliferative' else 'S' for i in bdata1.obs['phase']]
#oh yeah and a few individuals were apparently misannotated in this category in the metadata
bdata1.obs.loc[[i in ['A13','A16'] for i in bdata1.obs['individual']], 'phase'] = 'P'
bdata2.obs['location'] = 'EN'
Merge the two objects, and store the raw form in .raw. Filter out unexpressed genes, recompute mitochondrial percentage, then obtain log(CPM/100 + 1) form of data and store that in .X.
adata = bdata1.concatenate(bdata2, index_unique=None)
adata.raw = adata.copy()
sc.pp.filter_genes(adata, min_cells=3)
#convert to lower to be species agnostic: human mito start with MT-, mouse with mt-
mito_genes = [name for name in adata.var_names if name.lower().startswith('mt-')]
# for each cell compute fraction of counts in mito genes vs. all genes
# the `.A1` is only necessary as X is sparse (to transform to a dense array after summing)
adata.obs['percent_mito'] = np.sum(
adata[:, mito_genes].X, axis=1).A1 / np.sum(adata.X, axis=1).A1
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
The rest of the analysis will be analogous to what was done for the CBTM/biopsy separately.
Remove cell cycle genes by inverting the object and clustering the gene space, reporting back the cluster with known cycling genes as members.
bdata = adata.copy()
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
bdata = bdata[:, bdata.var['highly_variable']]
bdata = bdata.copy().T
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack')
sc.pl.pca_variance_ratio(bdata, log=True, save='_ccg_identification.pdf')
num_pcs = 20
sc.pp.neighbors(bdata,n_pcs=num_pcs)
sc.tl.umap(bdata)
sc.tl.leiden(bdata, resolution=0.4)
bdata.obs['known_cyclers'] = [i in ['CDK1','MKI67','CCNB2','PCNA'] for i in bdata.obs_names]
sc.pl.umap(bdata,color=['leiden','known_cyclers'],color_map='OrRd',save='_ccg_identification.pdf')
print(bdata.obs.loc[['CDK1','MKI67','CCNB2','PCNA'],'leiden'])
With the cycling genes identified, clean up the plotting folder by moving all related plots to a subfolder.
def MovePlots(plotpattern, subplotdir):
if not os.path.exists(str(sc.settings.figdir)+subplotdir):
os.makedirs(str(sc.settings.figdir)+'/'+subplotdir)
os.system('mv '+str(sc.settings.figdir)+'/*'+plotpattern+'*.* '+str(sc.settings.figdir)+'/'+subplotdir)
MovePlots('ccg_identification','ccg_identification')
Flag cell cycling genes and continue standard pipeline in a helper object until PCA.
adata.var['is_ccg'] = [i in bdata.obs[bdata.obs['leiden']=='10'].index for i in adata.var_names]
#compute HVG and PCA on separate helper object and port the results back to the original one
bdata = adata[:, ~adata.var['is_ccg']]
sc.pp.highly_variable_genes(bdata, min_mean=0.0125, max_mean=3, min_disp=0.5)
for col in ['highly_variable','means', 'dispersions', 'dispersions_norm']:
adata.var[col] = bdata.var[col]
#fill NaNs with False so that subsetting to HVGs is possible
adata.var['highly_variable'].fillna(value=False, inplace=True)
bdata = bdata[:, bdata.var['highly_variable']]
sc.pp.scale(bdata, max_value=10)
sc.tl.pca(bdata, svd_solver='arpack', n_comps=50)
adata.obsm['X_pca'] = bdata.obsm['X_pca'].copy()
adata.uns['pca'] = bdata.uns['pca'].copy()
adata.varm['PCs'] = np.zeros(shape=(adata.n_vars, 50))
adata.varm['PCs'][adata.var['highly_variable']] = bdata.varm['PCs']
sc.pl.pca_variance_ratio(adata, log=True, save='.pdf')
Correct cross-individual batch effects with Harmony.
num_pcs = 20
pca = adata.obsm['X_pca'][:, :num_pcs]
batch = adata.obs['individual']
%load_ext rpy2.ipython
%%R -i pca -i batch -o hem
library(harmony)
library(magrittr)
#this gets "90% of the way" to determinism. better than nothing.
set.seed(1)
hem = HarmonyMatrix(pca, batch, theta=1, verbose=FALSE, do_pca=FALSE)
hem = data.frame(hem)
Compute clustering and UMAP, plot metadata, clustering and individual location UMAPs.
adata.obsm['X_harmony'] = hem.values
sc.pp.neighbors(adata, use_rep='X_harmony')
sc.tl.leiden(adata, resolution = 0.5)
sc.tl.umap(adata)
#need to set this up manually as the different metadata csvs are not fully compatible
plotmeta = ['individual','location','phase','clinical','day','type','treatment','sample']
sc.pl.umap(adata, color=plotmeta, save='.pdf')
sc.pl.umap(adata, color='leiden', save='_clustering.pdf')
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_clustering_clusnumbers.pdf')
Plot canonical markers for ease of annotation.
with open('markers.txt','r') as fid:
markers = np.unique([line.rstrip() for line in fid.readlines()])
#make sure they're in the dataset, and sort them alphabetically for ease of finding things
markers = sorted([item for item in markers if item in adata.var_names])
for i in np.arange(np.ceil(len(markers)/4)):
sc.pl.umap(adata, color=markers[int(i*4):int((i+1)*4)], use_raw=False, save='-markers'+str(int(i+1))+'.pdf', color_map='OrRd')
#goodbye clutter!
MovePlots('markers','markers')
Identify cluster markers via the rank-biserial correlation, an effect size equivalent of the Wilcoxon test. Threshold the coefficient with the literature critical value for a large effect (0.5), and possibly lower the stringency if any clusters have fewer than five markers. Write out resulting marker list to CSV.
rbcde.RBC(adata, use_raw=False)
plot_dict = {}
degs = pd.DataFrame(columns=['cluster','RBC'])
for thresh in [0.5,0.3,0.2,0.1]:
degs_temp, temp_dict = rbcde.filter_markers(adata, use_raw=False, thresh=thresh)
for clus in temp_dict:
if len(temp_dict[clus])>=5 and clus not in plot_dict:
plot_dict[clus] = temp_dict[clus]
degs = pd.concat([degs, degs_temp.loc[degs_temp['cluster']==clus,:]])
if set(plot_dict.keys()) == set(temp_dict.keys()):
break
degs.to_csv(str(sc.settings.figdir)+'/cluster_markers_rbc.csv')
Do dotplots (for top 100 in the interest of legibility, if need be) and more detailed exports.
adata_count = AnnData(np.expm1(adata.X), var=adata.var, obs=adata.obs)
for clus in plot_dict:
degs_clus = degs.loc[degs['cluster']==clus,:].copy()
degs_clus['log2_FC'] = 0
degs_clus['percent_cluster'] = 0
degs_clus['percent_rest'] = 0
adata_clus = adata_count[adata.obs['leiden']==clus]
adata_rest = adata_count[[not i for i in adata.obs['leiden']==clus]]
adata_clus = adata_clus[:, degs_clus.index]
adata_rest = adata_rest[:, degs_clus.index]
degs_clus['log2_FC'] = np.asarray(np.log2(np.mean(adata_clus.X,axis=0)/np.mean(adata_rest.X,axis=0))).reshape(-1)
#are you expressed?
adata_clus.X.data = adata_clus.X.data > 0
adata_rest.X.data = adata_rest.X.data > 0
degs_clus['percent_cluster'] = np.asarray(100*np.sum(adata_clus.X,axis=0)/adata_clus.shape[0]).reshape(-1)
degs_clus['percent_rest'] = np.asarray(100*np.sum(adata_rest.X,axis=0)/adata_rest.shape[0]).reshape(-1)
degs_clus.to_csv(str(sc.settings.figdir)+'/cluster_markers_'+clus+'.csv')
sc.pl.dotplot(adata, {clus: plot_dict[clus][:100]}, groupby='leiden', use_raw=False, save='_cluster_markers_'+clus+'.pdf')
#goodbye clutter!
MovePlots('cluster_markers','cluster_markers')
And with that, think we're good. Save the object for later.
adata.write('endometrium-N1-integration.h5ad')